library(plotly)
library(rgl)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(ppcor)
library(GGally)
library(corrplot)
library(corrplot)
library(utf8)
library(arules)
library(arulesViz)
load("/home/andresfaral/Dropbox/LABO CERO/pelis")
dim(peliculas.df)
## [1] 242528 13
peliculas.df %>% arrange(-Votes*Year)
## Warning in -Votes * Year: NAs produced by integer overflow
resu<-peliculas.df %>% filter(Year<2019) %>% group_by(Year) %>% summarise(Cantidad=n())
ggplot(data = resu, aes(x = Year, y = Cantidad)) +
geom_point(color = 'red')
Empecemos por filtrar aquellas peliculas con mas de 200 votos. Luego me fijo cuantas filas y columnas me quedaron
peliculas.df = peliculas.df[which(peliculas.df$Votes > 200),]
dim(peliculas.df)
## [1] 50488 13
Ahora veo cuales son los gĆ©neros posibles. Como en el campo Genre puedo tener mas de uno separado por coma, primero creo una funcion que me los separa con este criterio, luego le aplico esa funcion a cada elemento Ćŗnico en el campo genres. Luego āaplastoā la lista de listas resultante para tener un array de una sola dimension. Finalmente elimino los lagging spaces y vuelvo a obtener los valores Ćŗnicos.
split_genre <-function(genres){strsplit(as.character(genres), ",")}
genres = sapply(unique(peliculas.df$Genre), split_genre,simplify = "array")
generos = unique(trimws(unlist(genres)))
generos = generos [-22]
for (i in 1:length(generos)) {
peliculas.df[generos[i]] = as.integer(as.logical(grepl(generos[i], trimws(peliculas.df$Genre))))
}
print(generos)
## [1] "Adventure" "Comedy" "War" "Biography" "Western" "Romance"
## [7] "Animation" "Drama" "Family" "Fantasy" "Action" "Sci-Fi"
## [13] "Music" "Thriller" "Horror" "History" "Mystery" "Crime"
## [19] "Sport" "Musical" "Adult" "Game-Show" "News"
Grafico de Barras
cant.gen<-as.data.frame(apply(peliculas.df[,14:36],2,sum))
cant.gen<-cant.gen %>% mutate(Genro=rownames(cant.gen))
names(cant.gen)<-c("Cantidad","Genero")
cant.gen<-cant.gen %>% arrange(Cantidad)
ggplot(data=cant.gen, aes(x=reorder(Genero,Cantidad),y=Cantidad, fill = Genero)) +
geom_bar(stat="identity")+
scale_colour_gradient2()+
coord_flip()+
scale_x_discrete(limits = cant.gen$TGenero)+
theme_classic() + labs(x="Genero")
Vaeamos ahora como se relacionan entre sà los géneros.
datos0<-peliculas.df[c(generos)]
datos0<-datos0==1
trans <- as(datos0, "transactions")
# Criterios
soporte<-0.05 # 0.2
confianza<-0.3 # 0.4
longitud.min<-2 # 2
longitud.max<-2 # 2
# Busco reglas
reglas <- apriori(trans,
parameter = list(supp=soporte,
conf=confianza,
minlen=longitud.min,
maxlen=longitud.max))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.3 0.1 1 none FALSE TRUE 5 0.05 2
## maxlen target ext
## 2 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 2524
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23 item(s), 50488 transaction(s)] done [0.01s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2
## Warning in apriori(trans, parameter = list(supp = soporte, conf = confianza, :
## Mining stopped (maxlen reached). Only patterns up to a length of 2 returned!
## done [0.00s].
## writing ... [6 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspectDT(reglas)
plot(reglas, method="graph",interactive = T,engine = "htmlwidget")
## Warning in plot.rules(reglas, method = "graph", interactive = T, engine =
## "htmlwidget"): The parameter interactive is deprecated. Use engine='interactive'
## instead.
MƔs Reglas
datos0<-peliculas.df[c(generos)]
datos0<-datos0==1
trans <- as(datos0, "transactions")
# Criterios
soporte<-0.01 # 0.2
confianza<-0.3 # 0.4
longitud.min<-2 # 2
longitud.max<-2 # 2
# Busco reglas
reglas <- apriori(trans,
parameter = list(supp=soporte,
conf=confianza,
minlen=longitud.min,
maxlen=longitud.max))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.3 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 2 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 504
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23 item(s), 50488 transaction(s)] done [0.01s].
## sorting and recoding items ... [19 item(s)] done [0.00s].
## creating transaction tree ... done [0.02s].
## checking subsets of size 1 2
## Warning in apriori(trans, parameter = list(supp = soporte, conf = confianza, :
## Mining stopped (maxlen reached). Only patterns up to a length of 2 returned!
## done [0.00s].
## writing ... [28 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspectDT(reglas)
plot(reglas, method="graph",interactive = T,engine = "htmlwidget")
## Warning in plot.rules(reglas, method = "graph", interactive = T, engine =
## "htmlwidget"): The parameter interactive is deprecated. Use engine='interactive'
## instead.
Dos generos: Drama (ROJO) Horror (CELESTE)
#
subbase<-peliculas.df %>% dplyr::filter(Horror==1|Drama==1)
evol.Dra<-subbase %>% dplyr::filter(Drama==1) %>% group_by(Year) %>% summarise(DurMed=mean(Runtime,na.rm=T),RatMed=mean(Rating,na.rm=T))
evol.Hor<-subbase %>% dplyr::filter(Horror==1) %>% group_by(Year) %>% summarise(DurMed=mean(Runtime,na.rm=T),RatMed=mean(Rating,na.rm=T))
RATING<-subbase$Rating
TIEMPO<-subbase$Year
DURACION<-subbase$Runtime
TITULO<-subbase$Title
#y<-log(subbase$Votes)
tama<-log(subbase$Votes)/10
colores<-ifelse(subbase$Drama==1,"red","cyan")
quegen<-ifelse(subbase$Drama==1,"Drama","Horror")
base.plot<-data.frame(Rating=RATING,Duracion=DURACION,Genero=quegen,Titulo=TITULO)
plot_ly(base.plot, x = ~Rating, y = ~Duracion, color = ~Genero, type = 'scatter', mode = 'markers',
text = ~Titulo,colors=c("red","cyan"))
## Warning: `arrange_()` was deprecated in dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## Warning: Ignoring 327 observations
GrƔfico Contour
ggplot(base.plot, aes(Rating, Duracion, color = Genero)) +
geom_point(shape = 16, size = 0.5, show.legend = FALSE,alpha=0.4) +
theme_minimal() + coord_cartesian(ylim=c(0,200))+ stat_density2d(geom = "polygon",alpha=0.2)
## Warning: Removed 327 rows containing non-finite values (stat_density2d).
## Warning: Removed 327 rows containing missing values (geom_point).
#
ggplot(base.plot, aes(Rating, Duracion, color = Genero)) + theme_minimal() + coord_cartesian(ylim=c(70,165))+ stat_density2d(geom = "polygon",alpha=0.1) + scale_alpha_continuous(range = c(0, .1))
## Warning: Removed 327 rows containing non-finite values (stat_density2d).
Relacion entre Año, Duración y Rating Plot 3D
plot3d(RATING,TIEMPO,DURACION,col=colores,zlim=c(60,180),type="p",size=3)
lines3d(evol.Dra$RatMed,evol.Dra$Year,evol.Dra$DurMed,col="black",lwd=5)
lines3d(evol.Hor$RatMed,evol.Hor$Year,evol.Hor$DurMed,col="black",lwd=6)
get_1genre <-function(genres){ return( trimws(unlist(strsplit(as.character(genres), ","))[1]))}
peliculas.df$Principal_Genre = unlist(sapply(peliculas.df$Genre, get_1genre,simplify = "array"))
peliculas.rec<-peliculas.df %>% dplyr::select(Year,Rating,Runtime,Principal_Genre) %>% na.omit()
calif<-peliculas.rec %>% group_by(Year,Principal_Genre) %>% summarise(Rating=mean(Rating),Duracion=mean(Runtime),Cant=n()) %>% filter(Principal_Genre=="Action"|Principal_Genre=="Drama"|Principal_Genre=="Horror"|Principal_Genre=="Comedy") %>% na.omit()
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
r3 <- plot_ly(calif, x = ~`Duracion`, y = ~`Rating`,
frame=~Year,size= ~`Cant`,color=~`Principal_Genre`,colors=c("blue","green","red","cyan"),mode="markers",marker = list(symbol = 'circle', sizemode = 'diameter',
line = list(width = 2, color = '#FFFFFF'), opacity=0.75)) %>% animation_opts(
1000, redraw = FALSE
)
r3
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter